import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Gender = ['Male', 'Female']
NUM_ROWS = 10000

# Build the synthetic survey table.  Rows are collected in a plain list and
# turned into a DataFrame once at the end: growing a DataFrame cell by cell
# with ``df.loc[i, col] = value`` reallocates the frame on every new row and
# becomes extremely slow for 10,000 rows.
records = []
for _ in range(NUM_ROWS):
    # Draw the random values in a fixed order: age, gender, weekday hours,
    # weekend hours, exercise.
    age = random.randint(6, 70)
    gender = random.choice(Gender)
    weekdays = random.randint(1, 4)
    weekend = random.randint(2, 10)
    # ScreenTime is a function of two other attributes: the average daily
    # screen time over a 7-day week (5 weekdays + 2 weekend days).
    tst = (5 / 7) * weekdays + (2 / 7) * weekend
    exercise = random.randint(1, 3)
    records.append({
        # Numeric attributes are stored as floats so the printed frame and
        # the exported CSV files show values such as 38.0, like the
        # cell-by-cell construction produced.
        'Age': float(age),
        'Gender': gender,
        'PC+TV on weekdays': float(weekdays),
        'PC+TV on weekends': float(weekend),
        'ScreenTime': tst,
        # Exposure label is a function of ScreenTime alone.
        'Screen_time_exposure': 'High' if tst > 2.5 else 'Low',
        'Exercise': float(exercise),
    })
df = pd.DataFrame(records)

# df1 keeps an anomaly-free copy; df is modified further below.
df1 = df.copy()
print("----------------------Initial dataset-------------------------")
print(df1)
# Inject anomalies into ScreenTime.
# Work on an explicit copy: the array returned by ``to_numpy()`` may be a
# view of the column (and is read-only under pandas Copy-on-Write), so
# writing into it directly is not reliable.
npy_array = df['ScreenTime'].to_numpy(copy=True)
# Spread the anomalies over the WHOLE column.  Drawing positions from
# ``range(15)`` put every anomaly into the first 15 rows (0.15% of the data),
# which contradicts the "1% to 20%" stated below.  500 distinct rows out of
# 10,000 is 5%.
num_anomalies = min(500, len(npy_array))
anomaly_indices = np.random.choice(len(npy_array), size=num_anomalies, replace=False)
npy_array[anomaly_indices] = np.random.exponential(scale=20, size=num_anomalies)
# Positional assignment; Screen_time_exposure is intentionally left untouched
# so the anomalous values break the functional relationship.
df['ScreenTime'] = npy_array
print("One of the attributes has a functional relationship with another attribute. This particular attribute also has 1% to 20% random anomalous data.")
print("------------------------------------First Dataset-----------------------------------------------")
print(df)
df.to_csv('First_Dataset.csv')
plt.subplots(figsize=(10,10))
boxplot = df.boxplot(column=['Age','PC+TV on weekdays', 'PC+TV on weekends', 'ScreenTime','Exercise'], grid=False, rot=45, fontsize=12)
plt.show()
# Sleeptime is a function of two other attributes (ScreenTime and Exercise).
# It is always derived from df1, the anomaly-free copy, and computed for all
# rows at once instead of looping over a hard-coded 10,000 row count with
# per-cell ``.loc`` writes.
sleep_time = 9 - df1['ScreenTime'] + df1['Exercise']

# Sleeping_pattern is a function of Sleeptime alone:
#   >= 9.5 -> "High",  6 <= value < 9.5 -> "Normal",  otherwise "Low".
# np.select takes the first matching condition, so the second condition only
# applies to rows that failed the first one.
sleep_pattern = np.select(
    [sleep_time >= 9.5, sleep_time >= 6],
    ["High", "Normal"],
    default="Low",
)

# Both frames receive the same clean values; df gets its Sleeptime anomalies
# in a later step.
for frame in (df1, df):
    frame['Sleeptime'] = sleep_time
    frame['Sleeping_pattern'] = sleep_pattern

print("One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes.They do not have any anomalous data")
print("----------------------------------Second Dataset------------------------------------------------------------")
print(df1)
df1.to_csv('Second_Dataset.csv')
plt.subplots(figsize=(10,10))
boxplot = df1.boxplot(column=['Age','PC+TV on weekdays', 'PC+TV on weekends', 'ScreenTime','Exercise','Sleeptime'], grid=False, rot=45, fontsize=12)
plt.show()
# Inject anomalies into Sleeptime.
# Work on an explicit copy: the array returned by ``to_numpy()`` may be a
# view of the column (and is read-only under pandas Copy-on-Write).
slp_array = df['Sleeptime'].to_numpy(copy=True)
# Spread the anomalies over the WHOLE column.  Drawing positions from
# ``range(30)`` put every anomaly into the first 30 rows (0.3% of the data),
# which contradicts the "1% to 20%" stated below.  1000 distinct rows out of
# 10,000 is 10%.
num_anomalies = min(1000, len(slp_array))
anomaly_indices = np.random.choice(len(slp_array), size=num_anomalies, replace=False)
slp_array[anomaly_indices] = np.random.exponential(scale=30, size=num_anomalies)
# Positional assignment; Sleeping_pattern is intentionally left untouched so
# the anomalous values break the functional relationship.
df['Sleeptime'] = slp_array
print(" One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes and alsohas 1% to 20% random anomalous data.")
print("------------------------------------Third Dataset-----------------------------------------------")
print(df)
df.to_csv('Third_Dataset.csv')
plt.subplots(figsize=(10,10))
boxplot = df.boxplot(column=['Age','PC+TV on weekdays', 'PC+TV on weekends', 'ScreenTime','Exercise','Sleeptime'], grid=False, rot=45, fontsize=12)
plt.show()
----------------------Initial dataset-------------------------
Age Gender PC+TV on weekdays PC+TV on weekends ScreenTime \
0 38.0 Male 2.0 4.0 2.571429
1 60.0 Female 3.0 4.0 3.285714
2 45.0 Male 1.0 5.0 2.142857
3 10.0 Male 3.0 6.0 3.857143
4 11.0 Male 2.0 9.0 4.000000
... ... ... ... ... ...
9995 57.0 Female 3.0 4.0 3.285714
9996 33.0 Female 2.0 7.0 3.428571
9997 15.0 Male 1.0 6.0 2.428571
9998 38.0 Female 1.0 8.0 3.000000
9999 27.0 Female 4.0 8.0 5.142857
Screen_time_exposure Exercise
0 High 1.0
1 High 1.0
2 Low 1.0
3 High 1.0
4 High 1.0
... ... ...
9995 High 3.0
9996 High 3.0
9997 Low 3.0
9998 High 2.0
9999 High 1.0
[10000 rows x 7 columns]
One of the attributes has a functional relationship with another attribute. This particular attribute also has 1% to 20% random anomalous data.
------------------------------------First Dataset-----------------------------------------------
Age Gender PC+TV on weekdays PC+TV on weekends ScreenTime \
0 38.0 Male 2.0 4.0 12.260531
1 60.0 Female 3.0 4.0 10.508764
2 45.0 Male 1.0 5.0 17.951749
3 10.0 Male 3.0 6.0 25.047673
4 11.0 Male 2.0 9.0 7.405383
... ... ... ... ... ...
9995 57.0 Female 3.0 4.0 3.285714
9996 33.0 Female 2.0 7.0 3.428571
9997 15.0 Male 1.0 6.0 2.428571
9998 38.0 Female 1.0 8.0 3.000000
9999 27.0 Female 4.0 8.0 5.142857
Screen_time_exposure Exercise
0 High 1.0
1 High 1.0
2 Low 1.0
3 High 1.0
4 High 1.0
... ... ...
9995 High 3.0
9996 High 3.0
9997 Low 3.0
9998 High 2.0
9999 High 1.0
[10000 rows x 7 columns]
One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes.They do not have any anomalous data
----------------------------------Second Dataset------------------------------------------------------------
Age Gender PC+TV on weekdays PC+TV on weekends ScreenTime \
0 38.0 Male 2.0 4.0 2.571429
1 60.0 Female 3.0 4.0 3.285714
2 45.0 Male 1.0 5.0 2.142857
3 10.0 Male 3.0 6.0 3.857143
4 11.0 Male 2.0 9.0 4.000000
... ... ... ... ... ...
9995 57.0 Female 3.0 4.0 3.285714
9996 33.0 Female 2.0 7.0 3.428571
9997 15.0 Male 1.0 6.0 2.428571
9998 38.0 Female 1.0 8.0 3.000000
9999 27.0 Female 4.0 8.0 5.142857
Screen_time_exposure Exercise Sleeptime Sleeping_pattern
0 High 1.0 7.428571 Normal
1 High 1.0 6.714286 Normal
2 Low 1.0 7.857143 Normal
3 High 1.0 6.142857 Normal
4 High 1.0 6.000000 Normal
... ... ... ... ...
9995 High 3.0 8.714286 Normal
9996 High 3.0 8.571429 Normal
9997 Low 3.0 9.571429 High
9998 High 2.0 8.000000 Normal
9999 High 1.0 4.857143 Low
[10000 rows x 9 columns]
One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes and alsohas 1% to 20% random anomalous data.
------------------------------------Third Dataset-----------------------------------------------
Age Gender PC+TV on weekdays PC+TV on weekends ScreenTime \
0 38.0 Male 2.0 4.0 12.260531
1 60.0 Female 3.0 4.0 10.508764
2 45.0 Male 1.0 5.0 17.951749
3 10.0 Male 3.0 6.0 25.047673
4 11.0 Male 2.0 9.0 7.405383
... ... ... ... ... ...
9995 57.0 Female 3.0 4.0 3.285714
9996 33.0 Female 2.0 7.0 3.428571
9997 15.0 Male 1.0 6.0 2.428571
9998 38.0 Female 1.0 8.0 3.000000
9999 27.0 Female 4.0 8.0 5.142857
Screen_time_exposure Exercise Sleeptime Sleeping_pattern
0 High 1.0 11.891582 Normal
1 High 1.0 19.631779 Normal
2 Low 1.0 59.365894 Normal
3 High 1.0 6.889189 Normal
4 High 1.0 24.970680 Normal
... ... ... ... ...
9995 High 3.0 8.714286 Normal
9996 High 3.0 8.571429 Normal
9997 Low 3.0 9.571429 High
9998 High 2.0 8.000000 Normal
9999 High 1.0 4.857143 Low
[10000 rows x 9 columns]
# These lines import the necessary libraries and modules for
# data manipulation (pandas), numerical computations (numpy), plotting (matplotlib.pyplot),
# and file and directory operations (glob and os).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
### Testing For Stationarity
# This line imports the adfuller function from the statsmodels.tsa.stattools module.
# The adfuller function is used for performing the Augmented Dickey-Fuller test,
# which is a statistical test for checking the stationarity of a time series.
from statsmodels.tsa.stattools import adfuller
# Create subplots with one trace per page using the go object of the plotly.graph_objs module.
# This line imports the go module from the plotly.graph_objs library.
# The go module provides objects and functions for creating interactive plots and visualizations.
import plotly.graph_objs as go
# Import TimeSeries class from darts library
# This line imports the TimeSeries class from the darts library.
# The darts library is a time series forecasting and modeling library in Python
from darts import TimeSeries
# These lines import the warnings module and set a filter to ignore warning messages.
# This is done to suppress any non-critical warning messages that may arise during the execution of the code
import warnings
warnings.filterwarnings("ignore")
# Install the darts library (a time series forecasting and modeling library in Python)
# into the notebook environment. The leading "!" makes this a Jupyter/IPython shell
# command; it is not valid syntax in a plain Python script.
!pip install darts
Requirement already satisfied: darts in c:\users\yasmi\anaconda3\lib\site-packages (0.24.0) Requirement already satisfied: matplotlib>=3.3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (3.7.0) Requirement already satisfied: pytorch-lightning>=1.5.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.0.3) Requirement already satisfied: tensorboardX>=2.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.6) Requirement already satisfied: tqdm>=4.60.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (4.64.1) Requirement already satisfied: shap>=0.40.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (0.41.0) Requirement already satisfied: pmdarima>=1.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.0.3) Requirement already satisfied: pyod>=0.9.5 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.0.9) Requirement already satisfied: scikit-learn>=1.0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.2.1) Requirement already satisfied: torch>=1.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.12.1) Requirement already satisfied: numpy>=1.19.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.23.5) Requirement already satisfied: requests>=2.22.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.28.1) Requirement already satisfied: prophet>=1.1.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.1.4) Requirement already satisfied: joblib>=0.16.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.1.1) Requirement already satisfied: holidays>=0.11.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (0.26) Requirement already satisfied: scipy>=1.3.2 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.10.0) Requirement already satisfied: xgboost>=1.6.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.7.5) Requirement already satisfied: nfoursid>=1.0.0 in 
c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.0.1) Requirement already satisfied: statsmodels>=0.13.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (0.13.5) Requirement already satisfied: lightgbm>=3.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (3.3.5) Requirement already satisfied: tbats>=1.1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.1.3) Requirement already satisfied: statsforecast>=1.4 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.5.0) Requirement already satisfied: catboost>=1.0.6 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.2) Requirement already satisfied: pandas>=1.0.5 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.5.3) Requirement already satisfied: xarray>=0.17.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2022.11.0) Requirement already satisfied: plotly in c:\users\yasmi\anaconda3\lib\site-packages (from catboost>=1.0.6->darts) (5.9.0) Requirement already satisfied: six in c:\users\yasmi\anaconda3\lib\site-packages (from catboost>=1.0.6->darts) (1.16.0) Requirement already satisfied: graphviz in c:\users\yasmi\anaconda3\lib\site-packages (from catboost>=1.0.6->darts) (0.20.1) Requirement already satisfied: python-dateutil in c:\users\yasmi\anaconda3\lib\site-packages (from holidays>=0.11.1->darts) (2.8.2) Requirement already satisfied: wheel in c:\users\yasmi\anaconda3\lib\site-packages (from lightgbm>=3.2.0->darts) (0.38.4) Requirement already satisfied: pillow>=6.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (9.4.0) Requirement already satisfied: packaging>=20.0 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (22.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (1.4.4) Requirement already satisfied: cycler>=0.10 in c:\users\yasmi\anaconda3\lib\site-packages (from 
matplotlib>=3.3.0->darts) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (4.25.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (3.0.9) Requirement already satisfied: contourpy>=1.0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (1.0.5) Requirement already satisfied: pytz>=2020.1 in c:\users\yasmi\anaconda3\lib\site-packages (from pandas>=1.0.5->darts) (2022.7) Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in c:\users\yasmi\anaconda3\lib\site-packages (from pmdarima>=1.8.0->darts) (0.29.35) Requirement already satisfied: urllib3 in c:\users\yasmi\anaconda3\lib\site-packages (from pmdarima>=1.8.0->darts) (1.26.14) Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pmdarima>=1.8.0->darts) (65.6.3) Requirement already satisfied: LunarCalendar>=0.0.9 in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (0.0.9) Requirement already satisfied: importlib-resources in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (5.12.0) Requirement already satisfied: cmdstanpy>=1.0.4 in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (1.1.0) Requirement already satisfied: convertdate>=2.1.2 in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (2.4.0) Requirement already satisfied: numba>=0.51 in c:\users\yasmi\anaconda3\lib\site-packages (from pyod>=0.9.5->darts) (0.56.4) Requirement already satisfied: torchmetrics>=0.7.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (0.11.4) Requirement already satisfied: typing-extensions>=4.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (4.4.0) Requirement already satisfied: PyYAML>=5.4 in 
c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (6.0) Requirement already satisfied: fsspec[http]>2021.06.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (2022.11.0) Requirement already satisfied: lightning-utilities>=0.7.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (0.8.0) Requirement already satisfied: idna<4,>=2.5 in c:\users\yasmi\anaconda3\lib\site-packages (from requests>=2.22.0->darts) (3.4) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\yasmi\anaconda3\lib\site-packages (from requests>=2.22.0->darts) (2.0.4) Requirement already satisfied: certifi>=2017.4.17 in c:\users\yasmi\anaconda3\lib\site-packages (from requests>=2.22.0->darts) (2022.12.7) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from scikit-learn>=1.0.1->darts) (2.2.0) Requirement already satisfied: slicer==0.0.7 in c:\users\yasmi\anaconda3\lib\site-packages (from shap>=0.40.0->darts) (0.0.7) Requirement already satisfied: cloudpickle in c:\users\yasmi\anaconda3\lib\site-packages (from shap>=0.40.0->darts) (2.0.0) Requirement already satisfied: plotly-resampler in c:\users\yasmi\anaconda3\lib\site-packages (from statsforecast>=1.4->darts) (0.8.3.2) Requirement already satisfied: fugue>=0.8.1 in c:\users\yasmi\anaconda3\lib\site-packages (from statsforecast>=1.4->darts) (0.8.5) Requirement already satisfied: patsy>=0.5.2 in c:\users\yasmi\anaconda3\lib\site-packages (from statsmodels>=0.13.0->darts) (0.5.3) Requirement already satisfied: protobuf<4,>=3.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from tensorboardX>=2.1->darts) (3.20.3) Requirement already satisfied: colorama in c:\users\yasmi\anaconda3\lib\site-packages (from tqdm>=4.60.0->darts) (0.4.6) Requirement already satisfied: pymeeus<=1,>=0.3.13 in c:\users\yasmi\anaconda3\lib\site-packages (from convertdate>=2.1.2->prophet>=1.1.1->darts) (0.5.12) 
Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in c:\users\yasmi\anaconda3\lib\site-packages (from fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (3.8.4) Requirement already satisfied: adagio>=0.2.4 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.2.4) Requirement already satisfied: jinja2 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (3.1.2) Requirement already satisfied: fugue-sql-antlr>=0.1.6 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.1.6) Requirement already satisfied: triad>=0.9.0 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.9.0) Requirement already satisfied: sqlglot in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (16.1.0) Requirement already satisfied: qpd>=0.4.3 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.4.3) Requirement already satisfied: pyarrow>=0.15.1 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (12.0.1) Requirement already satisfied: ephem>=3.7.5.3 in c:\users\yasmi\anaconda3\lib\site-packages (from LunarCalendar>=0.0.9->prophet>=1.1.1->darts) (4.1.4) Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in c:\users\yasmi\anaconda3\lib\site-packages (from numba>=0.51->pyod>=0.9.5->darts) (0.39.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly->catboost>=1.0.6->darts) (8.0.1) Requirement already satisfied: trace-updater>=0.0.8 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (0.0.9.1) Requirement already satisfied: dash<3.0.0,>=2.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (2.10.2) Requirement already satisfied: 
orjson<4.0.0,>=3.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (3.9.1) Requirement already satisfied: jupyter-dash>=0.4.2 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (0.4.2) Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (4.0.2) Requirement already satisfied: yarl<2.0,>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (1.9.2) Requirement already satisfied: attrs>=17.3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (22.1.0) Requirement already satisfied: frozenlist>=1.1.1 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (1.3.3) Requirement already satisfied: aiosignal>=1.1.2 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (1.3.1) Requirement already satisfied: multidict<7.0,>=4.5 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (6.0.4) Requirement already satisfied: dash-core-components==2.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.0.0) Requirement already satisfied: Flask<2.3.0,>=1.0.4 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.2.2) Requirement already satisfied: dash-table==5.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (5.0.0) Requirement 
already satisfied: Werkzeug<2.3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.2.2) Requirement already satisfied: dash-html-components==2.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.0.0) Requirement already satisfied: antlr4-python3-runtime<4.12,>=4.11.1 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue-sql-antlr>=0.1.6->fugue>=0.8.1->statsforecast>=1.4->darts) (4.11.1) Requirement already satisfied: ipykernel in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (6.19.2) Requirement already satisfied: nest-asyncio in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.5.6) Requirement already satisfied: ansi2html in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.8.0) Requirement already satisfied: retrying in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.3.4) Requirement already satisfied: ipython in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (8.10.0) Requirement already satisfied: fs in c:\users\yasmi\anaconda3\lib\site-packages (from triad>=0.9.0->fugue>=0.8.1->statsforecast>=1.4->darts) (2.4.16) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jinja2->fugue>=0.8.1->statsforecast>=1.4->darts) (2.1.1) Requirement already satisfied: click>=8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from Flask<2.3.0,>=1.0.4->dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (8.0.4) Requirement already satisfied: itsdangerous>=2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from 
Flask<2.3.0,>=1.0.4->dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.0.1) Requirement already satisfied: appdirs~=1.4.3 in c:\users\yasmi\anaconda3\lib\site-packages (from fs->triad>=0.9.0->fugue>=0.8.1->statsforecast>=1.4->darts) (1.4.4) Requirement already satisfied: psutil in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.9.0) Requirement already satisfied: debugpy>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.5.1) Requirement already satisfied: tornado>=6.1 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (6.1) Requirement already satisfied: comm>=0.1.1 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.1.2) Requirement already satisfied: pyzmq>=17 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (23.2.0) Requirement already satisfied: matplotlib-inline>=0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.1.6) Requirement already satisfied: traitlets>=5.4.0 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.7.1) Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (7.3.4) Requirement already satisfied: backcall in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.0) Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.30 in 
c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (3.0.36) Requirement already satisfied: pickleshare in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.7.5) Requirement already satisfied: decorator in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.1.1) Requirement already satisfied: jedi>=0.16 in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.18.1) Requirement already satisfied: stack-data in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.0) Requirement already satisfied: pygments>=2.4.0 in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (2.11.2) Requirement already satisfied: parso<0.9.0,>=0.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jedi>=0.16->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.8.3) Requirement already satisfied: jupyter-core>=4.9.2 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.2.0) Requirement already satisfied: entrypoints in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.4) Requirement already satisfied: wcwidth in c:\users\yasmi\anaconda3\lib\site-packages (from prompt-toolkit<3.1.0,>=3.0.30->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.5) Requirement already satisfied: executing in c:\users\yasmi\anaconda3\lib\site-packages (from 
stack-data->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.8.3) Requirement already satisfied: asttokens in c:\users\yasmi\anaconda3\lib\site-packages (from stack-data->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (2.0.5) Requirement already satisfied: pure-eval in c:\users\yasmi\anaconda3\lib\site-packages (from stack-data->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.2) Requirement already satisfied: pywin32>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.9.2->jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (305.1) Requirement already satisfied: platformdirs>=2.5 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.9.2->jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (2.5.2)
def plot(df,x_feature_name,y_feature_name,title):
    """
    Show one column of ``df`` plotted against another as a Plotly line chart.

    Args:
        df: Object indexable by column name (``df[name]``); the selected
            columns are passed straight to ``go.Scatter``.
        x_feature_name: Name of the column supplying the x values.
        y_feature_name: Name of the column supplying the y values. It is also
            used as the trace name and as the y-axis title.
        title: Text shown as the figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Single trace: lines with a marker on every point.
    series_trace = go.Scatter(
        x=df[x_feature_name],
        y=df[y_feature_name],
        name=y_feature_name,
        mode='lines+markers',
    )

    fig = go.Figure()
    fig.add_trace(series_trace)

    # Axis titles: the x axis is always labelled 'Date', whatever column is
    # passed as x_feature_name.
    fig.update_xaxes(title_text='Date')
    fig.update_yaxes(title_text=y_feature_name)

    # Fixed 1200x500 canvas with the caller-supplied title.
    fig.update_layout(title=f'{title}', height=500, width=1200)

    fig.show()
def train_test_predicted_plot(df_train, df_test, x_feature, y_feature, predicted, model_name):
    """
    Show training data, actual values and forecast together in one Plotly figure.

    Args:
        df_train: Object indexable by column name holding the training data.
        df_test: Object indexable by column name holding the actual values.
        x_feature: Name of the column used for the x values; it is read from
            ``df_train``, ``df_test`` and ``predicted`` alike.
        y_feature: Name of the column used for the y values; it is read from
            all three inputs and also used as the y-axis title.
        predicted: Object indexable by column name holding the forecasted values.
        model_name: Name of the forecasting model; used as the forecast trace
            name and in the figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()

    # One trace per data source, added in this order: training, actual, forecast.
    labelled_frames = (
        (df_train, 'Training Data'),
        (df_test, 'Actual Values'),
        (predicted, f'{model_name}'),
    )
    for frame, label in labelled_frames:
        fig.add_trace(
            go.Scatter(
                x=frame[x_feature],
                y=frame[y_feature],
                name=label,
                mode='lines+markers',
            )
        )

    # Axis titles: the x axis is always labelled 'Time'.
    fig.update_xaxes(title_text='Time')
    fig.update_yaxes(title_text=y_feature)

    # Fixed 1500x500 canvas; the title names the model.
    fig.update_layout(title=f'Forecasting using {model_name}', height=500, width=1500)

    # The figure is only displayed here; nothing is written to disk.
    fig.show()
# Load the sleep dataset and convert its 'date' column to datetime values
# in the same step, then render the frame in the notebook.
df = pd.read_csv('dataset.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
display(df)
| Unnamed: 0.1 | Unnamed: 0 | date | sleep_hours | |
|---|---|---|---|---|
| 0 | 0 | 0 | 2015-02-19 | 6.400000 |
| 1 | 1 | 1 | 2015-02-20 | 7.583333 |
| 2 | 2 | 2 | 2015-02-21 | 6.350000 |
| 3 | 3 | 3 | 2015-02-22 | 6.500000 |
| 4 | 4 | 4 | 2015-02-23 | 8.916667 |
| ... | ... | ... | ... | ... |
| 2349 | 2349 | 2349 | 2021-12-25 | 7.933333 |
| 2350 | 2350 | 2350 | 2021-12-26 | 3.850000 |
| 2351 | 2351 | 2351 | 2021-12-29 | 6.175000 |
| 2352 | 2352 | 2352 | 2021-12-30 | 5.158333 |
| 2353 | 2353 | 2353 | 2021-12-31 | 5.908333 |
2354 rows × 4 columns
# Write the frame (with the parsed 'date' column) back to the input file.
# index=False is required here: without it the row index is written as an
# extra unnamed column, so every run of the notebook adds another
# "Unnamed: 0", "Unnamed: 0.1", ... column to dataset.csv.
df.to_csv('dataset.csv', index=False)
# Box plot of sleep_hours on a 10x10 inch figure to eyeball outliers.
plt.subplots(figsize=(10,10))
boxplot1 = df.boxplot(column=['sleep_hours'], grid=False, rot=45, fontsize=12)
plt.show()
numerical_cols = ['sleep_hours']
# Calculate absolute z-scores for each numerical column
z_scores = np.abs((df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std())
# Set the threshold for outlier detection.
# NOTE(review): 14 is far looser than the conventional z-score cut-off of 3;
# with this value few or no rows are likely to be flagged - confirm the
# intended threshold.
outlier_threshold = 14
# Identify rows where ANY numerical column exceeds the threshold
# (NaN z-scores compare as False and are therefore kept).
outlier_rows = (z_scores > outlier_threshold).any(axis=1)
# Remove rows with outliers from the DataFrame
df_cleaned = df[~outlier_rows]
# Save and show the CLEANED DataFrame. Writing/printing ``df`` here would
# silently discard the filtering above and store the unfiltered data under
# the name cleaned_dataset.csv.
df_cleaned.to_csv('cleaned_dataset.csv', index=False)
print(df_cleaned)
Unnamed: 0.1 Unnamed: 0 date sleep_hours 0 0 0 2015-02-19 6.400000 1 1 1 2015-02-20 7.583333 2 2 2 2015-02-21 6.350000 3 3 3 2015-02-22 6.500000 4 4 4 2015-02-23 8.916667 ... ... ... ... ... 2349 2349 2349 2021-12-25 7.933333 2350 2350 2350 2021-12-26 3.850000 2351 2351 2351 2021-12-29 6.175000 2352 2352 2352 2021-12-30 5.158333 2353 2353 2353 2021-12-31 5.908333 [2354 rows x 4 columns]
# Reload the data from the file written by the cleaning step.
df = pd.read_csv('cleaned_dataset.csv')

# Report the dimensions of the frame as a (rows, columns) tuple,
# followed by blank lines as a visual separator.
data_shape = df.shape
print('Shape of the Data ', data_shape)
print('\n')

# Report summary statistics of the numerical columns
# (count, mean, standard deviation, minimum, quartiles and maximum).
print('Statistics Report of Data')
summary_stats = df.describe()
print(summary_stats)
Shape of the Data (2354, 3)
Statistics Report of Data
Unnamed: 0 sleep_hours
count 2354.000000 2354.000000
mean 1176.500000 7.356560
std 679.685589 2.213308
min 0.000000 1.266667
25% 588.250000 6.235417
50% 1176.500000 6.816667
75% 1764.750000 7.483333
max 2353.000000 17.433333
# The CSV stores dates as text, so convert the 'date' column back to
# datetime values before doing any date arithmetic.
df['date'] = pd.to_datetime(df['date'])

# Build the full daily calendar spanning the first to the last recorded date.
first_day = df['date'].min()
last_day = df['date'].max()
complete_dates = pd.date_range(start=first_day, end=last_day, freq='D')

# Wrap the calendar in a single-column frame so it can be joined on 'date'.
completed_dates_df = pd.DataFrame({'date': complete_dates})

# Left join: every calendar day is kept, and days that have no record in df
# get NaN in all of df's columns.
merged_df = completed_dates_df.merge(df, on='date', how='left')

# A calendar day counts as missing when it has no sleep_hours value.
is_missing = merged_df['sleep_hours'].isna()
missing_days = merged_df.loc[is_missing]

print('Missing Values in days:\n', len(missing_days))
print('Missing Day or Index')
display(missing_days)
Missing Values in days: 154 Missing Day or Index
| date | Unnamed: 0.1 | Unnamed: 0 | sleep_hours | |
|---|---|---|---|---|
| 14 | 2015-03-05 | NaN | NaN | NaN |
| 15 | 2015-03-06 | NaN | NaN | NaN |
| 16 | 2015-03-07 | NaN | NaN | NaN |
| 18 | 2015-03-09 | NaN | NaN | NaN |
| 22 | 2015-03-13 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... |
| 2390 | 2021-09-05 | NaN | NaN | NaN |
| 2399 | 2021-09-14 | NaN | NaN | NaN |
| 2469 | 2021-11-23 | NaN | NaN | NaN |
| 2503 | 2021-12-27 | NaN | NaN | NaN |
| 2504 | 2021-12-28 | NaN | NaN | NaN |
154 rows × 4 columns
# Visualize the missing days
# Wide, short canvas: 20 units wide by 4 units high
plt.figure(figsize=(20, 4))
# Plot the availability of sleep data (merged_df['sleep_hours'].notnull(),
# True where a value exists) against the calendar date in merged_df['date'],
# so that the x-axis really shows dates as its 'Date' label states.
# Circle markers joined by a thin (0.5) solid line.
plt.plot(merged_df['date'], merged_df['sleep_hours'].notnull(), marker='o', linestyle='-', linewidth=0.5)
# Title, axis labels and grid lines
plt.title('Missing Days')
plt.xlabel('Date')
plt.ylabel('Availability')
plt.grid(True)
# Show the plot
plt.show()
# Summary :
# The code visualizes the availability of sleep data for each day by plotting a graph. It uses the plot() function from matplotlib.pyplot
# to create a line plot where the x-axis represents the dates and the y-axis represents the availability of sleep data.
# The plot shows markers for the presence or absence of sleep data on each day.
# The resulting visualization provides a quick overview of the missing days where sleep data is not available
# Fill the gaps in 'sleep_hours' with pandas' linear interpolation
# (method='linear') and write the filled series back into merged_df.
sleep_hours_filled = merged_df['sleep_hours'].interpolate(method='linear')
merged_df['sleep_hours'] = sleep_hours_filled
# Count the 'sleep_hours' values that are still missing after the fill and
# report the total.
missing_days = merged_df['sleep_hours'].isna().sum()
print('Number of missing days:', missing_days)
# Draw the filled series: sleep hours against date on a 25 x 4 canvas,
# circle markers joined by a solid line of width 2.
plt.figure(figsize=(25, 4))
axes = plt.gca()
axes.plot(merged_df['date'], merged_df['sleep_hours'], marker='o', linestyle='-', linewidth=2)
axes.set_title('Sleep Hours by Date (Interpolated)')
axes.set_xlabel('Date')
axes.set_ylabel('Sleep Hours')
axes.grid(True)
# Show the plot
plt.show()
# Summary of the code:
# The code fills the missing values in the 'sleep_hours' column of the DataFrame using linear interpolation.
# It then calculates the number of missing days and prints the count. After that, it visualizes the filled data
# by creating a line plot with dates on the x-axis and sleep hours on the y-axis. The plot shows the interpolated
# values and provides an overview of the sleep hours by date.
Number of missing days: 0
# Plotly Express: the high-level Plotly interface used for the histogram.
import plotly.express as px
# Histogram of merged_df['sleep_hours'], titled 'Distribution of the Training
# Data', created with nbins=10.
fig = px.histogram(merged_df, x='sleep_hours', title='Distribution of the Training Data', nbins=10)
# Give every bar the same look: blue fill ('#636EFA') with a thin white outline.
bar_style = dict(marker_color='#636EFA', marker_line_color='white', marker_line_width=0.5)
fig.update_traces(**bar_style)
# Layout: axis titles, no legend, light grey plot background, and Arial
# size-12 dark grey text.
text_font = dict(family='Arial', size=12, color='#333333')
fig.update_layout(
    xaxis_title='Number of Hours',
    yaxis_title='Frequency',
    showlegend=False,
    plot_bgcolor='#f2f2f2',
    font=text_font,
)
# Same grid settings on both axes: visible, 0.5 wide, light gray.
grid_style = dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)
fig.show()
import plotly.graph_objects as go
# Start from an empty figure; traces and layout are added step by step.
fig = go.Figure()
# The same date / sleep_hours data is drawn twice: first as individual
# markers, then as a connecting line. The name of each trace is what the
# legend shows.
for draw_mode, trace_name in (('markers', 'Sleep Hours'), ('lines', 'Sleep Hours Trend')):
    fig.add_trace(go.Scatter(x=merged_df['date'], y=merged_df['sleep_hours'], mode=draw_mode, name=trace_name))
# Axis labels
fig.update_layout(xaxis_title='Date', yaxis_title='Sleep Hours')
# Grid lines: visible, 0.5 wide and light gray on both axes
grid_style = dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray')
fig.update_layout(xaxis=grid_style, yaxis=grid_style)
# Figure title
fig.update_layout(title='Data : Sleep Hours by Date')
# Display the plot
fig.show()
# Summary :
# The code uses Plotly graph objects to create a scatter plot of the sleep hours by date.
# It adds two scatter traces, one with markers representing individual sleep hour values
# and another with a line representing the trend. It customizes the axes labels, adds grid lines, sets the title,
# and displays the plot. The final plot will show the sleep hours data with markers and a trend line,
# along with customized labels, grid lines, and title for a visually appealing and informative visualization
# Index the frame by date so that 'sleep_hours' can be resampled over time.
df_indexed = merged_df.set_index('date')
sleep_by_date = df_indexed['sleep_hours']
# Sum 'sleep_hours' inside consecutive 36-hour and 48-hour windows, then
# turn the resulting date index back into an ordinary 'date' column.
df_36_hourly = sleep_by_date.resample('36h').sum().reset_index()
df_48_hourly = sleep_by_date.resample('48h').sum().reset_index()
fig = go.Figure()
# For each resampled series add a line trace followed by a marker trace;
# both traces of a series share the same legend name.
for series_label, resampled in (('36-Hourly', df_36_hourly), ('48-Hourly', df_48_hourly)):
    for draw_mode in ('lines', 'markers'):
        fig.add_trace(go.Scatter(x=resampled['date'], y=resampled['sleep_hours'], mode=draw_mode, name=series_label))
# Axis labels
fig.update_layout(xaxis_title='Date', yaxis_title='Sleep Hours')
# Grid lines: visible, 0.5 wide and light gray on both axes
grid_style = dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray')
fig.update_layout(xaxis=grid_style, yaxis=grid_style)
# Figure title
fig.update_layout(title='Sleep Hours by Date')
# Display the plot
fig.show()
# summary:
# The code uses Plotly graph objects to create a plot showing the sleep hours by date at 36-hour and 48-hour intervals.
# It adds scatter traces for both the line plot and the individual data points for each interval.
# The axes labels, grid lines, and title are customized. The final plot will show the sleep hours
# at different intervals, with line plots and markers, along with customized labels, grid lines,
# and title for an informative visualization of the sleep hour trends over time.
Box-Jenkins Framework
The Box-Jenkins method is a statistical technique used for time series analysis and forecasting. The approach starts with the assumption that the process that generated the time series can be approximated using an ARMA model if it is stationary or an ARIMA model if it is non-stationary.
The Box-Jenkins method applies autoregressive moving average (ARMA) or autoregressive integrated moving average (ARIMA) models to find the best fit of a time-series model to past values of a time series. The model can analyze several different types of time series data for forecasting purposes.
ARIMA Model Pipeline
Stationary Test
What is stationary Data ?
Stationary data refers to time series data whose mean and variance do not vary over time. The data is considered non-stationary if a strong trend or seasonality is observed in it.
Why do we need stationary data for an ARIMA model?
ARIMA models rely on the assumption that the time series being modeled is stationary, so that assumption needs to hold if you want to use these models. The ARIMA model differences the data to make it stationary, which means the statistical properties of the data stay consistent over time. Differencing removes the effect of trends or seasonality, such as those found in market or economic data. We make the data stationary for ARIMA because the ARIMA model looks at past data to predict future values.
def adfuller_test(values):
    """Run ``adfuller`` on ``values`` and print the result with a verdict.

    The first four entries of the result are printed as the test statistic,
    the p-value, the number of lags used and the number of observations
    used. A p-value of at most 0.05 is reported as stationary, anything
    else as non-stationary.

    NOTE(review): ``adfuller`` is not imported in this part of the file;
    presumably it is ``statsmodels.tsa.stattools.adfuller`` -- confirm.

    Args:
        values: The series to test; passed to ``adfuller`` unchanged.

    Returns:
        None. The outcome is only printed.
    """
    result = adfuller(values)
    labels = ('ADF Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used')
    # zip stops at the shorter input, so only the first four result entries
    # are printed.
    for label, value in zip(labels, result):
        print(label + ' : ' + str(value))
    p_value = result[1]
    verdict = (
        "P value is less than 0.05 that means we can reject the null hypothesis(Ho). Therefore we can conclude that data has no unit root and is stationary"
        if p_value <= 0.05
        else "Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary "
    )
    print(verdict)
# Run the stationarity test on the 'sleep_hours' column of merged_df
adfuller_test(merged_df['sleep_hours'])
ADF Test Statistic : -2.411101199370657 p-value : 0.1386276602953715 #Lags Used : 22 Number of Observations Used : 2485 Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary
# Build a darts TimeSeries from merged_df using the 'date' and 'sleep_hours'
# columns at daily frequency (freq='D'), asking darts to fill in missing dates.
# NOTE(review): fillna_value=True passes a boolean where a numeric fill value
# would be expected -- confirm the intended fill value.
time_series_daily = TimeSeries.from_dataframe(merged_df,'date','sleep_hours',freq='D' ,fill_missing_dates=True,fillna_value=True)
# Split the series at the 0.80 point into a training part and a test part
train , test = time_series_daily.split_after(0.80)
print('Shape of train set : ',train.pd_dataframe().shape)
print('Shape of test set : ',test.pd_dataframe().shape)
# Number of rows in the test set; used later as the number of steps to predict
Horizan = test.pd_dataframe().shape[0]
Shape of train set : (2006, 1) Shape of test set : (502, 1)
from darts.utils.statistics import plot_acf, check_seasonality

# Probe every candidate period from 2 to 24 and report each one for which
# check_seasonality gives a positive result at alpha=0.05.
for m in range(2, 25):
    is_seasonal, period = check_seasonality(time_series_daily, m=m, alpha=0.05)
    if not is_seasonal:
        continue
    print("There is seasonality of order {}.".format(period))
There is seasonality of order 10. There is seasonality of order 14. There is seasonality of order 17. There is seasonality of order 20.
The autocorrelation function (ACF) is used to identify the order of ARIMA models. The ACF plot shows the correlation between the time series and its lagged version. The lag at which the ACF plot crosses the upper confidence interval for the first time is considered as the order of the MA component of the ARIMA model. Similarly, if the ACF plot decays slowly, it indicates that there is a high degree of autocorrelation in the time series, which means that an AR component should be included in the ARIMA model.
from darts.utils.statistics import plot_acf,plot_pacf
# Autocorrelation plot of the first-differenced series (diff(1)), up to lag 100.
# NOTE(review): m=12 is passed here while the PACF plot uses m=7 -- confirm
# which value is intended.
plot_acf(time_series_daily.diff(1), m=12, max_lag=100, fig_size=(10, 5), axis=None, default_formatting=True)
# Label the axes, set the title and show the figure
plt.xlabel('lags')
plt.ylabel('correlation')
plt.title('Auto Correlation Plot')
plt.show()
The partial autocorrelation function (PACF) is also used to identify the order of ARIMA models. The PACF plot shows the correlation between the time series and its lagged version, but with the influence of the intermediate lags removed. The lag at which the PACF plot crosses the upper confidence interval for the first time is considered as the order of the AR component of the ARIMA model.
from darts.utils.statistics import plot_acf,plot_pacf
# Partial autocorrelation plot of the daily series, up to lag 100.
# NOTE(review): this uses the undifferenced series, whereas the ACF plot uses
# time_series_daily.diff(1) -- confirm whether the PACF should be differenced too.
plot_pacf(time_series_daily, m=7, max_lag=100, fig_size=(10, 5), axis=None, default_formatting=True)
# Label the axes, set the title and show the figure
plt.xlabel('lags')
plt.ylabel('correlation')
plt.title('Partial Auto Correlation Plot')
plt.show()
from darts.models.forecasting.arima import ARIMA
# These orders were chosen from the autocorrelation and partial autocorrelation
# plots (see the explanatory paragraphs that accompany those plots).
# Other orders can also be explored by trial and error.
arima_model = ARIMA(p=2 , # autoregressive (AR) order
d=1 , # order of differencing, used to make the data stationary
q=3 , # moving-average (MA) order
seasonal_order=(3, 1, 3, 7)
)
# Fit the model on the training split
arima_model.fit(train)
ARIMA(p=2, d=1, q=3, seasonal_order=(3, 1, 3, 7), trend=None, random_state=0, add_encoders=None)
# Forecast Horizan steps with the fitted model (Horizan is set to the number
# of rows in the test set) and show the resulting series
predictions = arima_model.predict(Horizan)
display(predictions)
<TimeSeries (DataArray) (date: 502, component: 1, sample: 1)>
array([[[5.9193949 ]],
[[5.88743747]],
[[6.1192225 ]],
[[6.30209793]],
[[6.3121213 ]],
[[6.39180562]],
[[6.32902015]],
[[6.0761223 ]],
[[6.14630629]],
[[6.0773535 ]],
...
[[5.93558361]],
[[6.11248137]],
[[6.07352592]],
[[6.3344708 ]],
[[6.21043021]],
[[5.79617676]],
[[5.88247547]],
[[5.94634898]],
[[6.08371473]],
[[6.10547089]]])
Coordinates:
* date (date) datetime64[ns] 2020-08-17 2020-08-18 ... 2021-12-31
* component (component) object 'sleep_hours'
Dimensions without coordinates: sample
Attributes:
static_covariates: None
hierarchy: None
# Convert train_series into a pandas dataframe and reset index
# Turn the training series into a DataFrame with the index moved into a column
df_train = train.pd_dataframe().reset_index()
# Convert test_series into a pandas dataframe and reset index
df_test = test.pd_dataframe().reset_index()
# Convert prediction into a pandas dataframe and reset index
forecast = predictions.pd_dataframe().reset_index()
# Column names handed to the plotting helper
x_feature ='date'
y_feature='sleep_hours'
# NOTE(review): model_name is assigned but the call below passes the literal
# 'ARIMA-Prediction' instead -- confirm which label is intended.
model_name = 'Arima Prediction'
# train_test_predicted_plot is defined elsewhere in this file
train_test_predicted_plot(df_train,df_test,x_feature,y_feature,forecast,'ARIMA-Prediction')
Suppose you have the following true and predicted time series data:
y_true = [1, 2, 3, 4, 5] and y_pred = [1.2, 2.3, 3.4, 4.5, 5.6]. To calculate the mean squared error (MSE) between y_true and y_pred, you would use the following formula:
MSE = (1/n) * sum((y_true_i - y_pred_i)^2)
In this case, the MSE would be:
MSE = (1/5) * ((1-1.2)^2 + (2-2.3)^2 + (3-3.4)^2 + (4-4.5)^2 + (5-5.6)^2) = (1/5) * (0.04 + 0.09 + 0.16 + 0.25 + 0.36) = 0.18
To calculate the mean absolute percentage error (MAPE) between y_true and y_pred, you would use the following formula:
MAPE = (1/n) * sum(abs((y_true_i - y_pred_i)/y_true_i))
In this case, the MAPE would be:
MAPE = (1/5) * (abs((1-1.2)/1) + abs((2-2.3)/2) + abs((3-3.4)/3) + abs((4-4.5)/4) + abs((5-5.6)/5)) ≈ 0.146, i.e. about 14.6%
The MSE measures the average squared difference between the predicted and true values in a time series. It is a measure of how well your model fits the data.
The MAPE measures the average percentage difference between the predicted and true values in a time series. It is a measure of how well your model predicts future values.
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
def Evaluations_metrics(y_true,y_pred):
    """Compute, print and return four error metrics for a forecast.

    Args:
        y_true: The observed values.
        y_pred: The predicted values, aligned with ``y_true``.

    Returns:
        A tuple ``(mse, mape, mae, rmse)``. ``mape`` is the value returned by
        ``mean_absolute_percentage_error`` multiplied by 100, and ``rmse`` is
        the square root of ``mse``.
    """
    mse_value = mean_squared_error(y_true, y_pred)
    # RMSE is the square root of the MSE computed above.
    rmse_value = np.sqrt(mse_value)
    mae_value = mean_absolute_error(y_true, y_pred)
    mape_value = mean_absolute_percentage_error(y_true, y_pred) * 100
    # Report in the fixed order MSE, MAPE, MAE, RMSE.
    report = (
        ('Mean Sqaured Error(MSE) : ', mse_value),
        ('Mean absolute Percentage Error (MAPE)(percentage Error) : ', mape_value),
        ('Mean Absolute Error : ', mae_value),
        ('Root Mean Sqaure Error :', rmse_value),
    )
    for label, value in report:
        print(label, value)
    return mse_value, mape_value, mae_value, rmse_value
# Score the forecast against the test set, comparing the 'sleep_hours' columns as plain lists
mse_value , mape_value ,mae_value ,rmse_value = Evaluations_metrics(df_test['sleep_hours'].tolist() ,forecast['sleep_hours'].tolist() )
Mean Sqaured Error(MSE) : 1.2709446975173189 Mean absolute Percentage Error (MAPE)(percentage Error) : 13.31269939637065 Mean Absolute Error : 0.8885399983122241 Root Mean Sqaure Error : 1.1273618307878437
!pip install nbconvert
Requirement already satisfied: nbconvert in c:\users\yasmi\anaconda3\lib\site-packages (6.5.4) Requirement already satisfied: traitlets>=5.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (5.7.1) Requirement already satisfied: nbclient>=0.5.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.5.13) Requirement already satisfied: defusedxml in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.7.1) Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (1.5.0) Requirement already satisfied: mistune<2,>=0.8.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.8.4) Requirement already satisfied: pygments>=2.4.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (2.11.2) Requirement already satisfied: nbformat>=5.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (5.7.0) Requirement already satisfied: packaging in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (22.0) Requirement already satisfied: lxml in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (4.9.1) Requirement already satisfied: jupyter-core>=4.7 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (5.2.0) Requirement already satisfied: tinycss2 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (1.2.1) Requirement already satisfied: entrypoints>=0.2.2 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.4) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (2.1.1) Requirement already satisfied: jupyterlab-pygments in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.1.2) Requirement already satisfied: beautifulsoup4 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (4.11.1) Requirement already satisfied: jinja2>=3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (3.1.2) Requirement already satisfied: bleach 
in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (4.1.0) Requirement already satisfied: platformdirs>=2.5 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.7->nbconvert) (2.5.2) Requirement already satisfied: pywin32>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.7->nbconvert) (305.1) Requirement already satisfied: jupyter-client>=6.1.5 in c:\users\yasmi\anaconda3\lib\site-packages (from nbclient>=0.5.0->nbconvert) (7.3.4) Requirement already satisfied: nest-asyncio in c:\users\yasmi\anaconda3\lib\site-packages (from nbclient>=0.5.0->nbconvert) (1.5.6) Requirement already satisfied: jsonschema>=2.6 in c:\users\yasmi\anaconda3\lib\site-packages (from nbformat>=5.1->nbconvert) (4.17.3) Requirement already satisfied: fastjsonschema in c:\users\yasmi\anaconda3\lib\site-packages (from nbformat>=5.1->nbconvert) (2.16.2) Requirement already satisfied: soupsieve>1.2 in c:\users\yasmi\anaconda3\lib\site-packages (from beautifulsoup4->nbconvert) (2.3.2.post1) Requirement already satisfied: six>=1.9.0 in c:\users\yasmi\anaconda3\lib\site-packages (from bleach->nbconvert) (1.16.0) Requirement already satisfied: webencodings in c:\users\yasmi\anaconda3\lib\site-packages (from bleach->nbconvert) (0.5.1) Requirement already satisfied: attrs>=17.4.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (22.1.0) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.18.0) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient>=0.5.0->nbconvert) (2.8.2) Requirement already satisfied: pyzmq>=23.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient>=0.5.0->nbconvert) (23.2.0) Requirement already satisfied: tornado>=6.0 in 
c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient>=0.5.0->nbconvert) (6.1)